getwd()
## [1] "C:/Users/laura.spencer/Work/red-king_RNASeq-2022/notebooks"

Load libraries and source scripts

source("../references/biostats.R")

list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "cowplot", "clipr", "janitor", "ggpubr", "forcats") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

# Install DESeq2
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

#BiocManager::install("DESeq2")
require(DESeq2)

Load counts and sample info

load(file = "../results/gene-counts") #object = counts
load(file="../data/sample.info") #object = sample.info
load(file = "../results/gene-counts-trans") #object = counts.t

Optional

Pre-filtering - remove rows (genes) with less than a total of 10 reads (across all samples)

NOTE: As of January 19th, 2022 I will not remove any low-count genes.

keep <- colSums(counts.t) >= 10
counts.ts <- counts.t[,keep]

print(paste("# genes remaining after pre-filtering:", ncol(counts.ts)))
## [1] "# genes remaining after pre-filtering: 22460"
print(paste("# of genes dropped:", ncol(counts.t) - ncol(counts.ts), sep=" "))
## [1] "# of genes dropped: 5827"

Summary stats

# What's the average no. of genes per sample? 
data.frame(rowSums(counts.t != 0)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample") %>% 
  summarise(mean=mean(count.total), sd=sd(count.total), se=sd/sqrt(length(sample)),
            min=min(count.total), max=max(count.total))
##       mean       sd       se   min   max
## 1 20549.33 362.5444 55.94181 19547 21210
# No. of reads per sample? 
data.frame(rowSums(counts.t)) %>% 
                  dplyr::rename(read.total = 1) %>% 
                  rownames_to_column(var="sample") %>% #summary() 
summarise(mean=mean(read.total), sd=sd(read.total), se=sd/sqrt(length(sample))) #use this to average across all samples 
##       mean      sd       se
## 1 14227057 2171890 335129.9

How many genes were identified in each sample?

ggplotly(
ggplot(data = data.frame(rowSums(counts.t != 0)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample")) +
           geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())) 

Use foa.plots to visualize data a bit:

In how many samples does each gene occur?

  • The first four plots portray the gene’ frequency of occurrence among samples in a number of different ways – either as an empirical cumulative distribution function (ECDF) of gene occurrence or as a histogram of gene occurrence.

What is the mean abundance of each gene when it occurs (not averaging zeros for samples where it is absent)?

  • The fifth plot is an ECDF of gene mean abundance. X-axis is samples, ranked from 1-n in terms of mean gene abundance.

Is the mean abundance of genes correlated with the number of samples they occur in?

  • The sixth plot is a scatter plot of frequency of occurrence against mean abundance. Is there any apparent relationship between the two? Are the widespread genes also generally more abundant? Are there many widespread genes that occur at low abundance? Conversely, are there genes much less widespread, but abundant where they occur?

Is the total abundance of gene in a sample correlated with the number of gene in a sample? To answer this question, first it is instructive to look at the number of gene per sample.

  • The eighth plot depicts the ECDF of sample richness. Are there any interesting patterns? For example, do most samples support an average number of gene, while only a few samples supporting either very few or very many gene? Or is the pattern different?

Second, what is the pattern in the distribution of sample total abundance?

  • The ninth plot is the ECDF of total sample abundance. How does it compare to the ECDF of sample richness?

Finally, to answer the question on the relation between total abundance and number of gene/sample …

  • The last plot is a scatter plot of the two variables. Is there is relationship between the number of genes per sample and the total abundance? Do gene-rich samples generally have a greater total abundance of those genes as well?
# note: you'll need to press return in the console for all plots 
#foa.plots(counts.ts)

Merge sample key info to count data, then sort, and generate heat map for initial inspection by treatment

# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day 
counts.tk <- merge(x=sample.info[,c("Sample", "Tank", "Treatment", "Treatment_Tank")], by.x="Sample", y=counts.ts, by.y="row.names") %>% 
  arrange(Treatment, Tank)  %>% column_to_rownames(var="Sample") %>% droplevels()

head(counts.tk[1:24]) #check out results of merge/arrange
##               Tank Treatment Treatment_Tank evm.TU.HiC_scaffold_97.1
## Tank_1_Crab_1    1   Ambient      Ambient.1                       11
## Tank_1_Crab_2    1   Ambient      Ambient.1                       12
## Tank_1_Crab_3    1   Ambient      Ambient.1                       16
## Tank_2_Crab_1    2   Ambient      Ambient.2                        8
## Tank_2_Crab_2    2   Ambient      Ambient.2                       21
## Tank_2_Crab_3    2   Ambient      Ambient.2                       37
##               evm.TU.HiC_scaffold_86.1 evm.TU.HiC_scaffold_86.2
## Tank_1_Crab_1                        3                        1
## Tank_1_Crab_2                        2                        0
## Tank_1_Crab_3                        4                        1
## Tank_2_Crab_1                        8                        1
## Tank_2_Crab_2                        0                        3
## Tank_2_Crab_3                        6                        2
##               evm.TU.HiC_scaffold_42.1 evm.TU.HiC_scaffold_22.1
## Tank_1_Crab_1                     3794                        2
## Tank_1_Crab_2                     4720                        4
## Tank_1_Crab_3                     6307                        8
## Tank_2_Crab_1                     4243                        4
## Tank_2_Crab_2                     6527                        1
## Tank_2_Crab_3                     4750                        0
##               evm.TU.HiC_scaffold_22.2 evm.TU.HiC_scaffold_29.1
## Tank_1_Crab_1                       53                      545
## Tank_1_Crab_2                      112                      609
## Tank_1_Crab_3                      146                      727
## Tank_2_Crab_1                       67                      980
## Tank_2_Crab_2                       48                      876
## Tank_2_Crab_3                       23                      846
##               evm.TU.HiC_scaffold_28.2 evm.TU.HiC_scaffold_50.1
## Tank_1_Crab_1                        7                      351
## Tank_1_Crab_2                        5                      330
## Tank_1_Crab_3                        7                      443
## Tank_2_Crab_1                       25                      424
## Tank_2_Crab_2                       20                      555
## Tank_2_Crab_3                       28                      314
##               evm.TU.HiC_scaffold_50.2 evm.TU.HiC_scaffold_92.1
## Tank_1_Crab_1                      721                      155
## Tank_1_Crab_2                      474                      235
## Tank_1_Crab_3                      375                      339
## Tank_2_Crab_1                      389                      120
## Tank_2_Crab_2                      768                       46
## Tank_2_Crab_3                      767                       88
##               evm.TU.HiC_scaffold_92.2 evm.TU.HiC_scaffold_92.3
## Tank_1_Crab_1                       54                       26
## Tank_1_Crab_2                       56                       53
## Tank_1_Crab_3                       55                      154
## Tank_2_Crab_1                       74                       41
## Tank_2_Crab_2                       60                       10
## Tank_2_Crab_3                       73                       20
##               evm.TU.HiC_scaffold_92.4 evm.TU.HiC_scaffold_103.1
## Tank_1_Crab_1                        5                         6
## Tank_1_Crab_2                       14                        11
## Tank_1_Crab_3                       29                        16
## Tank_2_Crab_1                        2                        14
## Tank_2_Crab_2                        5                        16
## Tank_2_Crab_3                        3                        14
##               evm.TU.HiC_scaffold_102.1 evm.TU.HiC_scaffold_29.2
## Tank_1_Crab_1                        20                     4142
## Tank_1_Crab_2                        16                     6019
## Tank_1_Crab_3                        27                     6202
## Tank_2_Crab_1                        26                     5295
## Tank_2_Crab_2                        34                     3604
## Tank_2_Crab_3                        23                     4752
##               evm.TU.HiC_scaffold_29.3 evm.TU.HiC_scaffold_29.4
## Tank_1_Crab_1                       53                       11
## Tank_1_Crab_2                       27                       15
## Tank_1_Crab_3                       28                        6
## Tank_2_Crab_1                       97                       47
## Tank_2_Crab_2                       53                       14
## Tank_2_Crab_3                       70                       23
##               evm.TU.HiC_scaffold_29.5 evm.TU.HiC_scaffold_78.1
## Tank_1_Crab_1                        0                     1483
## Tank_1_Crab_2                        0                     1224
## Tank_1_Crab_3                        0                     1075
## Tank_2_Crab_1                        1                     1904
## Tank_2_Crab_2                        4                     1270
## Tank_2_Crab_3                        0                     1899
#counts.tk %>% dplyr::select(starts_with("evm")) #this is code to get only the gene columns 

Generate heat map of counts before DESeq processing / analysis

NOTE: scale=“column” b/c range of counts is so huge, so counts have been scaled

pheatmap(data.matrix(counts.tk %>% dplyr::select(starts_with("evm"))), Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA, 
                     show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE, 
                     scale="column", color=c("dodgerblue3", "goldenrod1"), 
                     main = "RKC gene counts", annotation_row=counts.tk[,c("Treatment", "Tank")])

Analysis in DESeq2

Reformat for DESeq, ensure correct sample order for

NOTE: It is absolutely critical that the columns of the count matrix and the rows of the column data (information about samples) are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.

all(rownames(counts.tk) == counts.tk %>% dplyr::select(starts_with("evm")) %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE' 
## [1] TRUE

Generate DESeq datasets with various treatment comparisons

dds.pH <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
                              colData = counts.tk[,"Treatment", drop=FALSE] ,
                              design = ~ Treatment)

dds.tank <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
                              colData = counts.tk[,"Tank", drop=FALSE] ,
                              design = ~ Tank)

dds.treat.tank <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
                              colData = counts.tk[,"Treatment_Tank", drop=FALSE] ,
                              design = ~ Treatment_Tank)

Visualize data via PCAs and heat maps

HERE: REDO PCA USING CODE FROM MULTIVARIATE STATS

Transform data

  • Here we transform counts using a variance stabilizing transformation (VST), since we have many samples.
  • Here we use blind=FALSE b/c we are interested in differences explained by experimental design, and may wish to use this transformed data in downstream analyses.
vsd.pH <- varianceStabilizingTransformation(dds.pH, blind=FALSE)
vsd.tank <- varianceStabilizingTransformation(dds.tank, blind=FALSE)
vsd.treat.tank <- varianceStabilizingTransformation(dds.treat.tank, blind=FALSE)

Visualize sample clustering via PCA (after transformation)

NOTE: Hover over points to see the sample numbers

# PCA with points color coded by pH Treatment 
ggplotly(
  plotPCA(vsd.pH, intgroup="Treatment") + 
           ggtitle("PCA by Treatment (var-stabilizing transformed)") + 
    geom_point(size=3, aes(text=colnames(vsd.pH))) + 
    theme_minimal()+ stat_ellipse(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by tank 
ggplotly(
  plotPCA(vsd.tank, intgroup="Tank") + 
           ggtitle("PCA by Tank (var-stabilizing transformed)") + 
    geom_point(size=3, aes(text=colnames(vsd.tank))) + 
   theme_minimal(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by treatment + tank 
ggplotly(
  plotPCA(vsd.treat.tank, intgroup="Treatment_Tank") + 
           ggtitle("PCA by Treatment & Tank (var-stabilizing transformed)") + 
    geom_point(size=3, aes(text=colnames(vsd.treat.tank))) + 
   theme_minimal(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# Ambient pH (8.0) tanks:  1,2,7,10,11
# Moderate pH (7.8) tanks: 3,4,9,20
# Low pH (7.5) tanks: 5,13,15,16,18

PCA.data.pH <- plotPCA(vsd.pH, intgroup=c("Treatment"), returnData=TRUE)
save(PCA.data.pH, file="../results/PCA.data.pH")

Generate heat maps before & after transformation

# extract treatment info from VSD transformation 
#vsd.df <- as.data.frame(colData(vsd)[,c("population", "pCO2.parent")])

# generate heatmap from untransformed counts 
#pheatmap(counts(dds), cluster_rows=FALSE, show_rownames=FALSE,
#         cluster_cols=T, annotation_col=vsd.df, scale = "row", main="QuantSeq, untransformed data (but scaled by rows")

# generate heatmap from VSD counts 
#pheatmap(assay(vsd), cluster_rows=FALSE, show_rownames=FALSE,
#         cluster_cols=T, annotation_col=vsd.df, main = "QuantSeq, VSD-transformed")

Heatmap of the sample-to-sample distances

Another use of the transformed data is sample clustering. Here, we apply the dist function to the transpose of the transformed count matrix to get sample-to-sample distances.

A heatmap of this distance matrix gives us an overview over similarities and dissimilarities between samples. We have to provide a hierarchical clustering hc to the heatmap function based on the sample distances, or else the heatmap function would calculate a clustering based on the distances between the rows/columns of the distance matrix.

colors <- colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)

sampleDistss <- dist(t(assay(vsd.pH)))
sampleDistMatrixs <- as.matrix(sampleDistss)

# Here we only show pH treatment 
rownames(sampleDistMatrixs) <- vsd.pH$Treatment #set row names 
colnames(sampleDistMatrixs) <- NULL
pheatmap(sampleDistMatrixs,
         clustering_distance_rows=sampleDistss,
         clustering_distance_cols=sampleDistss,
         col=rev(colors), fontsize = 6)

# Here we also show tank number 
sampleDistss <- dist(t(assay(vsd.treat.tank)))
sampleDistMatrixs <- as.matrix(sampleDistss)

rownames(sampleDistMatrixs) <- vsd.treat.tank$Treatment_Tank #set row names 
colnames(sampleDistMatrixs) <- NULL
pheatmap(sampleDistMatrixs,
         clustering_distance_rows=sampleDistss,
         clustering_distance_cols=sampleDistss,
         col=rev(colors), fontsize = 6)

Differential Expression Analysis - multifactor design

Run the function DESeq to assess differential expression

dds.DESeq.pH <- DESeq(dds.pH) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 86 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

Any DEGs between pH treatment?

#colData(dds.DESeq.pH)
print("Comparison: Ambient pH vs. Moderate pH")
## [1] "Comparison: Ambient pH vs. Moderate pH"
summary(res.pco2.AM <- results(dds.DESeq.pH, contrast=c("Treatment", "Ambient", "Moderate"), alpha=0.05))
## 
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 603, 2.7%
## LFC < 0 (down)     : 227, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 7403, 33%
## (mean count < 13)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Moderate:",  sum(res.pco2.AM$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Moderate: 830"
print("Comparison: Ambient pH vs. Low pH")
## [1] "Comparison: Ambient pH vs. Low pH"
summary(res.pco2.AL <- results(dds.DESeq.pH, contrast=c("Treatment", "Ambient", "Low"), alpha=0.05))
## 
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1067, 4.8%
## LFC < 0 (down)     : 430, 1.9%
## outliers [1]       : 0, 0%
## low counts [2]     : 6967, 31%
## (mean count < 11)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Low:",  sum(res.pco2.AL$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Low: 1497"
print("Comparison: Moderate pH vs. Low pH")
## [1] "Comparison: Moderate pH vs. Low pH"
summary(res.pco2.ML <- results(dds.DESeq.pH, contrast=c("Treatment", "Moderate", "Low"), alpha=0.05))
## 
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 12, 0.053%
## LFC < 0 (down)     : 5, 0.022%
## outliers [1]       : 0, 0%
## low counts [2]     : 9580, 43%
## (mean count < 27)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Moderate vs. Low:",  sum(res.pco2.ML$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Moderate vs. Low: 17"

Create heatmaps with DEGs

#Subset the DESeq results for only those statistically sign. ones 
diffex.AM <- subset(res.pco2.AM, padj < 0.05)
diffex.AL <- subset(res.pco2.AL, padj < 0.05)
diffex.ML <- subset(res.pco2.ML, padj < 0.05)

### Generate counts matrix With DEGs among any pH treatment
diffex.pH.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% unique(c(rownames(diffex.AM),rownames(diffex.AL),rownames(diffex.ML))))

# Create a dataframe to annotate heatmap with treatments 
dds.pH.df <- sample.info[match(colnames(diffex.pH.counts), sample.info$Sample),c("Sample", "Treatment", "Tank")] %>%
  remove_rownames() %>% column_to_rownames(var = "Sample")

# Make sure treatment order is correct
all(colnames(diffex.pH.counts) == rownames(dds.pH.df)) #double check that samples are still in same order 
## [1] TRUE
# All DEGs among any pH treatment 
pheatmap(diffex.pH.counts, cluster_rows=TRUE, cluster_cols=FALSE,
         show_rownames=FALSE, na.rm=TRUE, annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Moderate="#fdae61", Low="#d7191c")),
         scale="row", main = "Differentially expressed genes (DEGs) among any pH treatment", 
         annotation_col=dds.pH.df[,"Treatment", drop=FALSE], fontsize = 8)

# # another way to do a heatmap 
# heatmap(diffex.pH.counts, Colv = NA, scale="row", ColSideColors = brewer.pal(3, "Set1")[dds.pH.df$Treatment])
### ----- Generate pairwise comparison heatmaps 

# Create count matrices containing DEGs for each pairwise treatment comparison, rows (genes) ordered by pvalue, 

diffex.AM.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.AM)) %>% as.data.frame() %>%
  dplyr::select((sample.info %>% arrange(Treatment) %>% 
                   filter(Treatment=="Ambient" | Treatment== "Moderate"))$Sample) %>% 
  rownames_to_column(var = "gene") %>% arrange(match(gene, c(as.data.frame(diffex.AM) %>% arrange(padj) %>% rownames()))) %>%
  column_to_rownames(var = "gene") %>% as.matrix()
  
diffex.AL.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.AL)) %>% as.data.frame() %>%
  dplyr::select((sample.info %>% arrange(Treatment) %>% 
                   filter(Treatment=="Ambient" | Treatment== "Low"))$Sample) %>% 
  rownames_to_column(var = "gene") %>% arrange(match(gene, c(as.data.frame(diffex.AL) %>% arrange(padj) %>% rownames()))) %>% 
  column_to_rownames(var = "gene") %>% as.matrix()

diffex.ML.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.ML)) %>% as.data.frame() %>%
  dplyr::select((sample.info %>% arrange(Treatment) %>% 
                   filter(Treatment=="Moderate" | Treatment== "Low"))$Sample) %>% 
  rownames_to_column(var = "gene") %>%
  arrange(match(gene, c(as.data.frame(diffex.ML) %>% arrange(padj) %>% rownames()))) %>% 
  column_to_rownames(var = "gene") %>% as.matrix()


pheatmap(diffex.AM.counts, cluster_cols = FALSE, cluster_rows=TRUE, 
         show_rownames=FALSE, na.rm=TRUE, scale="row", 
         main = "DEGs among pH 8.0 (ambient) & pH 7.8 (moderate), sorted by p-value",
         fontsize = 8, cutree_rows = 2, 
         annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Moderate="#fdae61")),
         annotation_col=dds.pH.df[colnames(diffex.AM.counts),"Treatment", drop=FALSE])

pheatmap(diffex.AL.counts, cluster_cols = FALSE, cluster_rows=TRUE, 
         show_rownames=FALSE, na.rm=TRUE, scale="row", 
         main = "DEGs among pH 8.0 (ambient) & pH 7.5 (low), sorted by p-value",
         fontsize = 8, cutree_rows = 2,
         annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Low="#d7191c")),
         annotation_col=dds.pH.df[colnames(diffex.AL.counts),"Treatment", drop=FALSE])

pheatmap(diffex.ML.counts, cluster_cols = FALSE, cluster_rows=TRUE, 
         show_rownames=TRUE, na.rm=TRUE, scale="row", 
         main = "DEGs among pH 7.8 (moderate) & pH 7.5 (low), sorted by p-value",
         fontsize = 8, cutree_rows = 2, 
         annotation_colors = list(Treatment=c(Moderate="#fdae61", Low="#d7191c")),
         annotation_col=dds.pH.df[colnames(diffex.ML.counts),"Treatment", drop=FALSE])

Enrichment analysis

Here I will do a quick enrichment analysis using the online tool DAVID. I copy the Uniprot SPIDs for the annotated DEGs and all annotated genes examined into the DAVID Functional Annotation Tools. I do so for each pairwise pH comparison.

# DEGs between Ambient & Moderate (includes 733 annotated genes, when outlier removed it's 344)
diffex.AM %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>% 
  left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>% 
  dplyr::select(SPID) %>%  na.omit() %>% unlist() %>% as.vector() %>% write_clip()


# DEGs between Ambient & Low (includes 967 annotated genes, when outlier removed it's 679)
diffex.AL %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>% 
  left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>% 
  dplyr::select(SPID) %>%  na.omit() %>% unlist() %>% as.vector() %>% write_clip()


# DEGs between Moderate & Low (includes just 2 annotated genes, 3 when outlier removed) <----- None of these were registered by DAVID 
diffex.ML %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>% 
  left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>% 
  dplyr::select(SPID) %>%  na.omit() %>% unlist() %>% as.vector() %>% write_clip()

# All genes (background, includes 7,238 annotated genes)
res.pco2.AM %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>% 
  left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>%
  dplyr::select(SPID) %>%  na.omit() %>% unlist() %>% as.vector() %>% write_clip()

DAVID Results

Enriched biological processes (and associated gene lists) are saved to GitHub: – Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Mod.txt
– Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Low.txt
– Moderate pH vs. Low pH, NONE

Updated Enriched biological processes with outlier sample removed (Tank 7 Crab 4) – Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Mod_no-outlier-T7C4.txt
– Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Low_no-outlier-T7C4.txt
– Moderate pH vs. Low pH, NONE

Visualize with REVIGO - read in DAVID enrichment results, extract GO terms and PValues and copy to clipboard to paste into REVIGO

# With all samples 
read_delim(file="../results/DAVID-Enriched-BP_Amb-Mod.txt", delim = "\t") %>% 
  mutate(GO = str_extract(Term, "GO(.*?)~")) %>% 
  mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()

read_delim(file="../results/DAVID-Enriched-BP_Amb-Low.txt", delim = "\t") %>% 
  mutate(GO = str_extract(Term, "GO(.*?)~")) %>% 
  mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()

# With outlier sample (Tank 7 Crab 4) removed 
# Between ambient and moderate pH 
read_delim(file="../results/DAVID-Enriched-BP_Amb-Mod_no-outlier-T7C4.txt", delim = "\t") %>% 
  mutate(GO = str_extract(Term, "GO(.*?)~")) %>% 
  mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()

# Between ambient and low pH 
read_delim(file="../results/DAVID-Enriched-BP_Amb-Low_no-outlier-T7C4.txt", delim = "\t") %>% 
  mutate(GO = str_extract(Term, "GO(.*?)~")) %>% 
  mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()